# load dataset from csv file
# load python modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display # Allows the use of display() for DataFrames
#import seaborn as sns
%matplotlib inline
def load_data(file_name):
    """Load a wholesale flight dataset from a CSV file.

    The string-typed 'STATUS' column is split off before the numeric
    coercion and re-attached afterwards so it survives as text.

    Args:
        file_name: path to the comma-delimited CSV file.

    Returns:
        DataFrame with every column numeric except 'STATUS', or None if
        the file could not be read/parsed (the original left `data`
        unbound and raised NameError on the failure path).
    """
    data = None
    try:
        data = pd.read_csv(file_name, delimiter=",")
        # Split off 'STATUS' (string type) before the numeric coercion.
        # np.str was removed from numpy; the builtin str is equivalent here.
        status_cl = pd.DataFrame(data['STATUS'], columns=['STATUS'], dtype=str).reset_index(drop=True)
        data = data.drop('STATUS', axis=1)
        # Coerce the remaining columns to numeric; unparsable cells -> NaN.
        data = data.apply(pd.to_numeric, args=('coerce',)).reset_index(drop=True)
        data = pd.concat([data, status_cl], axis=1)
        data = data.reset_index(drop=True)
        print(data.dtypes)
        print("Wholesale flight dataset has {} samples with {} features each.".format(*data.shape))
    except (IOError, OSError, ValueError, KeyError) as err:
        # Narrowed from a bare except: report what went wrong instead of
        # silently swallowing every error (including typos in this body).
        print("Dataset could not be loaded! ({})".format(err))
    return data
# --- Load the raw training and testing CSVs and stack them into one frame ---
print "Loading training data..."
learn_data = load_data(r'raw_data.csv').reset_index(drop=True)
print "\nLoading testing data..."
test_data = load_data(r'test_data.csv').reset_index(drop=True)
# Row-wise concatenation; reset_index gives one continuous 0..n-1 index.
all_data = pd.concat([learn_data, test_data], axis=0).reset_index(drop=True)
print all_data.columns
# samples of dataset
from IPython.display import display # Allows the use of display() for DataFrames
# Hand-picked row indices spread across the combined dataset.
indices = [1,100, 200, 1073,9999, 32050]
# Show the chosen samples in two halves (first 10 columns, then the rest).
samples = pd.DataFrame(np.round(all_data.loc[indices], 3), columns = all_data.keys()[0:10])
display(samples)
samples = pd.DataFrame(np.round(all_data.loc[indices], 3), columns = all_data.keys()[10:])
display(samples)
# Summary statistics, split into two column groups for readability.
display(all_data[all_data.keys()[0:11]].describe().round(3))
display(all_data[all_data.keys()[11:]].describe().round(3))
# Pairwise scatter plots with KDE on the diagonal.
# NOTE(review): pd.scatter_matrix was removed in pandas 1.0; newer code
# should call pandas.plotting.scatter_matrix instead.
pd.scatter_matrix(all_data[all_data.keys()[0:6]], alpha = 0.3, figsize = (24,8), diagonal = 'kde');
pd.scatter_matrix(all_data[all_data.keys()[6:12]], alpha = 0.3, figsize = (24,8), diagonal = 'kde');
pd.scatter_matrix(all_data[all_data.keys()[11:-4]], alpha = 0.3, figsize = (24,8), diagonal = 'kde');
def show_data_from_logid(log_id, features, time_interval=1.0):
    """Plot selected features of one flight log over time.

    Args:
        log_id: value of the 'LOG_ID' column selecting the log to plot.
        features: list whose items are either a feature name (str) or a
            tuple (name, shift) / (name, shift, scale); the feature is
            scaled then shifted before plotting.
        time_interval: spacing of the x ticks in scaled time units.
    """
    # Explicit copy so the shift/scale below never mutates (or warns about
    # mutating) the global all_data frame through a slice view.
    logid_data = all_data[all_data['LOG_ID'] == log_id].copy()
    # Normalize the feature spec, applying optional scale/shift in place.
    plain_features = []
    for item in features:
        if type(item) is tuple:
            feature_name = item[0]
            if len(item) == 3:
                logid_data[feature_name] = logid_data[feature_name] * item[2]  # scale value
            logid_data[feature_name] = logid_data[feature_name] + item[1]  # shift value
            plain_features.append(feature_name)
        else:
            plain_features.append(item)
    # Convert raw time stamps for plotting.
    # NOTE(review): divisor 0.5e+6 disagrees with the 1.0e+6 used elsewhere
    # for "convert to seconds" -- confirm the intended time unit.
    logid_data['TIME'] = logid_data['TIME'] / (0.5e+6)
    # X ticks at a fixed spacing over the log's time span.
    x_stick = np.arange(logid_data['TIME'].min(), logid_data['TIME'].max(), time_interval)
    x_stick = pd.DataFrame(x_stick, columns=['Time'])
    # plot data
    fig, ax = plt.subplots(figsize=(14, 8))
    logid_data.plot(x='TIME', y=plain_features,
                    xticks=x_stick['Time'], kind='line', ax=ax, rot=70);
# Motor commands plus barometer (shifted by -100.2e+3 and scaled by 100) for log 1.
features2use = ['MOTOR1', 'MOTOR2', 'MOTOR3', 'MOTOR4', ('BAR', -100.2e+3, 100.0)]
show_data_from_logid(1,features2use, 1.5)
# Lidar altitude alone.
features2use = ['LIDAR']
show_data_from_logid(1,features2use, 1.5)
# Accelerometers (ACC_Z shifted up by 8.0), attitude angles and magnetometer Z.
features2use = ['ACC_X','ACC_Y', ('ACC_Z', 8.0), 'PITCH', 'ROLL', 'MAG_Z']
show_data_from_logid(1, features2use, 1.5)
from sklearn.cross_validation import train_test_split
from sklearn.tree import DecisionTreeRegressor
# create dummies
def preprocess_dumies(X):
    """One-hot encode the 'STATUS' column, passing all other columns through.

    Args:
        X: input DataFrame.

    Returns:
        New DataFrame in which 'STATUS' is replaced by STATUS_<value> dummy
        columns (e.g. 'STATUS_flight', 'STATUS_take_off'); every other
        column is copied unchanged.
    """
    outX = pd.DataFrame(index=X.index)  # output dataframe, initially empty
    # .items() replaces .iteritems(), which was removed in pandas 2.0.
    for col, col_data in X.items():
        # Only the string-typed 'STATUS' column gets dummy-encoded.
        if col_data.dtype == object and col == 'STATUS':
            col_data = pd.get_dummies(col_data, prefix=col)
        outX = outX.join(col_data)  # collect column(s) in output dataframe
    return outX
print all_data.index
# Replace the string STATUS column with one-hot STATUS_* columns.
all_data = preprocess_dumies(all_data)
determ_result = None
keys = all_data.keys()
print keys
# Columns excluded from the determinant (redundancy) analysis below.
features2ignore = ['TIME', 'LOG_ID', 'LAND_STATUS', 'index']
def remove_features(all_features, features_to_remove):
    """Return a new list of all_features without the entries listed in
    features_to_remove (order of the survivors is preserved)."""
    return [feature for feature in all_features
            if feature not in features_to_remove]
# make determinant analysis: for each feature, train a decision tree to
# predict it from all OTHER (non-ignored) features; a high R^2 means the
# feature is largely redundant given the rest.
determ_result = pd.DataFrame(columns=['Feature', 'Score_value'])
regressor = DecisionTreeRegressor(random_state=333)
row_counter = 0
for key in all_data.keys():
    if key not in features2ignore:
        # get y data: the feature being predicted
        y_data = all_data[key]
        # get x data: every other non-ignored feature
        x_features = remove_features(all_data.keys(), features2ignore + [key])
        x_data = all_data[x_features]
        X_train, X_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.25, random_state = 333)
        regressor.fit(X_train, y_train)
        # R^2 on the held-out quarter of the data
        score = regressor.score(X_test, y_test)
        determ_result.loc[row_counter] = [key, score]
        row_counter += 1
display(determ_result)
from sklearn import preprocessing
# gravity constant (m/s^2)
g = 9.80665
# features to use
#features2use = ['THRUST', 'ACC_X','ACC_Y','ACC_Z', 'MAG_Z', 'PITCH', 'ROLL']
features2use = ['MOTOR1', 'MOTOR2', 'MOTOR3', 'MOTOR4', 'THRUST', \
                'ACC_X','ACC_Y','ACC_Z', 'MAG_Z', 'PITCH', 'ROLL']
# Per-feature shift added by make_positive() so every value becomes positive
# (accelerations are assumed bounded by +/- 8 g, angles by +/- pi -- TODO confirm).
min_vals = {'MOTOR1': 900, 'MOTOR2': 900, 'MOTOR3': 900, 'MOTOR4': 900,
            'THRUST': 1.0,
            'ACC_X': 8.0 * g, 'ACC_Y': 8.0 * g, 'ACC_Z': 8.0 * g,
            'MAG_Z': 1.3,
            'PITCH': np.pi, 'ROLL': np.pi, 'YAW': np.pi
            }
# Expected per-feature maxima AFTER the shift above is applied.
max_vals = {'MOTOR1': 1890, 'MOTOR2': 1890, 'MOTOR3': 1890, 'MOTOR4': 1890,
            'THRUST': 1.0 + min_vals['THRUST'],
            'ACC_X': 8.0 * g + min_vals['ACC_X'], 'ACC_Y': 8.0 * g + min_vals['ACC_Y'], 'ACC_Z': 8.0 * g + min_vals['ACC_Z'],
            'MAG_Z': 1.3 + min_vals['MAG_Z'],
            'PITCH': np.pi + min_vals['PITCH'], 'ROLL': np.pi + min_vals['ROLL'], 'YAW': np.pi + min_vals['YAW']}
def make_positive(data, features2use):
    """Shift the selected features so all their values become positive.

    Features listed in the global min_vals dict are shifted up by their
    predefined offset; any other feature is shifted up by the magnitude of
    its own minimum when that minimum is <= 0.

    Args:
        data: input DataFrame (left unmodified).
        features2use: names of the feature columns to shift.

    Returns:
        A shifted copy of `data` with a fresh 0..n-1 index.
    """
    data_positive = data.copy()
    for key in features2use:
        if key in min_vals:
            # Known feature: add its predefined positive offset.
            data_positive[key] = data_positive[key] + min_vals[key]
        else:
            min_val = data_positive[key].min()
            if min_val <= 0.0:
                # Bug fix: the original ADDED the (non-positive) minimum,
                # pushing values further negative; subtracting it shifts
                # the column minimum up to exactly 0.
                data_positive[key] = data_positive[key] - min_val
    return data_positive.reset_index(drop=True)
def scale(data, features2scale, min_=0.00001, max_=255.0):
    """Linearly rescale the selected columns into the range [min_, max_].

    Args:
        data: input DataFrame (left unmodified).
        features2scale: column names to rescale; other columns are copied.
        min_: lower bound of the target range.
        max_: upper bound of the target range.

    Returns:
        A rescaled copy of `data` with a fresh 0..n-1 index.
    """
    scaled = data.copy()
    for key in scaled.keys():
        if key in features2scale:
            col_min = data[key].min()
            col_range = data[key].max() - col_min
            # Guard: a constant column has zero range; map it to min_
            # instead of dividing by zero (the original produced inf/NaN).
            if col_range == 0.0:
                scaled[key] = min_
            else:
                scaled[key] = ((max_ - min_) / col_range) * (scaled[key] - col_min) + min_
    return scaled.reset_index(drop=True)
# Apply the positivity shift, then rescale into the working range.
data_positive = make_positive(all_data, features2use)
data_scaled = scale(data_positive, features2use)
print "Features before applying: make positive and scaling:"
# NOTE(review): this displays the already-shifted data_positive, so the
# "before" and "with positive values" tables are identical -- the first
# display was presumably meant to show all_data[features2use].
display(data_positive[features2use].describe())
print "Features with positive values:"
display(data_positive[features2use].describe())
print "Scaled features:"
display(data_scaled[features2use].describe())
pd.scatter_matrix(data_scaled[features2use[0:5]], alpha = 0.3, figsize = (24,8), diagonal = 'kde');
pd.scatter_matrix(data_scaled[features2use[6:12]], alpha = 0.3, figsize = (24,8), diagonal = 'kde');
def update_dictionary(ind_dic, indices):
    """Increment, in place, ind_dic's hit counter for every index in indices.

    Args:
        ind_dic: dict mapping sample index -> number of features for which
            that sample was flagged as an outlier.
        indices: iterable of sample indices flagged for the current feature.
    """
    for value in indices:
        # dict.has_key() was removed in Python 3; `in` is the portable form.
        if value in ind_dic:
            ind_dic[value] = ind_dic[value] + 1
        else:
            ind_dic[value] = 1
def display_indices(ind_dic):
    """Print every sample index that was flagged as an outlier more than once.

    Args:
        ind_dic: dict mapping sample index -> flag count (see
            update_dictionary); printed most-flagged first.
    """
    # sorted() works on the items view; Python 3's dict.items() no longer
    # returns a list with an in-place .sort().
    items = sorted(ind_dic.items(), key=lambda tup: tup[1], reverse=True)
    for item in items:
        if item[1] > 1:
            print("{:3d}: repeats in {:d} times".format(item[0], item[1]))
indices_dic = {} # dictionary for indices in outliers
all_outliers = [] # list of outliers to revoke
# Tukey-style outlier scan per feature, with widened percentiles (10/90)
# and a widened step factor (1.7 instead of the classic 1.5 x IQR).
for feature in features2use:
    # Q1: 10th percentile of the scaled feature
    Q1 = np.percentile(data_scaled[feature], 10.0)
    # Q3: 90th percentile of the scaled feature
    Q3 = np.percentile(data_scaled[feature], 90.0)
    # Outlier step: 1.7 x the inter-percentile range
    step = 1.7 * (Q3 - Q1)
    print "\nData points considered outliers for the feature '{}':".format(feature)
    print "Q1-step={:.3f}, Q3+step={:.3f}, step={:.3f}".format(Q1-step, Q3+step, step)
    # Display the outliers
    cols = [feature, 'LAND_STATUS', 'STATUS_flight', 'STATUS_land', 'STATUS_take_off']
    # Rows outside the fence, excluding genuine touchdown rows
    # (landing state with LAND_STATUS == 1), which are expected extremes.
    outliers = data_scaled[~((data_scaled[feature] >= Q1 - step) &
                             (data_scaled[feature] <= Q3 + step)) &
                           ~((data_scaled['STATUS_land'] == 1.0) & (data_scaled['LAND_STATUS'] == 1.0))]
    #all_outliers = all_outliers + list(outliers.index)
    update_dictionary(indices_dic, list(outliers.index))
    #display(outliers[cols])
# Notebook markdown leaked into the script -- conclusions read off the scan
# above: all motors OK; thrust outliers OK; ACC_X: 75.0-190.0;
# ACC_Y: 40.0-175.0; ACC_Z: 75.0-220.0; MAG_Z: 150.0-;
# PITCH: 80.0-245.0; ROLL: 25.0-125.0
# Manually chosen keep-ranges (low, high) per feature, read off the outlier
# scan above; None means unbounded on that side.
outliers_dict = {
    'MOTOR1': (40.868, 250.447),
    'MOTOR2': (53.579, 264.316),
    'MOTOR3': (23.536, 274.104),
    'MOTOR4': (40.489, 259.557),
    'THRUST': (34.542, 235.443),
    'ACC_X': (75.0, 190.0),
    'ACC_Y': (40.0, 175.0),
    'ACC_Z': (75.0, 220.0),
    'MAG_Z': (150.0, None),
    'PITCH': (80.0, 245.0),
    'ROLL': (25.0, 125.0)}
def cut_outliers(input_dataset, params):
    """Drop rows whose feature values fall outside the given keep-ranges.

    Args:
        input_dataset: DataFrame to filter (left unmodified).
        params: dict mapping feature name -> (low, high) keep-range; a None
            bound means unbounded on that side. Features absent from the
            dataset are ignored.

    Returns:
        A copy of input_dataset containing only rows inside every range.
    """
    dataset = input_dataset.copy()
    columns = input_dataset.columns
    for key in params.keys():
        if key not in columns:
            continue
        print(key)
        range_from, range_to = params[key]
        # Open-ended bounds: None means +/- infinity.
        if range_from is None:
            range_from = -np.inf
            print("range_from", range_from)
        if range_to is None:
            print("range_to", range_to)
            range_to = np.inf
        # Rows outside [range_from, range_to] for this feature are removed
        # before the next feature is processed.
        sub_dataset = dataset[~((dataset[key] >= range_from) & (dataset[key] <= range_to))]
        dataset = dataset.drop(sub_dataset.index)
    return dataset
#print data_scaled.shape
# Drop rows outside the hand-picked keep-ranges and re-inspect the data.
filtered_outliers = cut_outliers(data_scaled, outliers_dict)
display(filtered_outliers.describe())
pd.scatter_matrix(filtered_outliers[features2use[0:6]], alpha = 0.3, figsize = (24,8), diagonal = 'kde');
pd.scatter_matrix(filtered_outliers[features2use[6:12]], alpha = 0.3, figsize = (24,8), diagonal = 'kde');
# Notebook markdown leaked into the script: apply the logarithm function to
# all selected features.
def log_scale(dataset, features=None):
    """Apply the natural logarithm to the selected feature columns.

    Args:
        dataset: input DataFrame (left unmodified); the selected columns
            must already be strictly positive.
        features: column names to transform; defaults to the global
            features2use list (the original hard-coded behavior), so
            existing callers are unaffected.

    Returns:
        A copy of dataset with the selected columns log-transformed.
    """
    if features is None:
        features = features2use
    log_data = dataset.copy()
    for feature in features:
        log_data[feature] = np.log(log_data[feature])
    return log_data
# Log-transform the (now strictly positive) features and re-inspect.
all_log_data = log_scale(filtered_outliers)
display(all_log_data.describe())
pd.scatter_matrix(all_log_data[features2use[0:6]], alpha = 0.3, figsize = (24,8), diagonal = 'kde');
pd.scatter_matrix(all_log_data[features2use[6:12]], alpha = 0.3, figsize = (24,8), diagonal = 'kde');
from scipy.stats import shapiro
# check data for normal distributed with Shapiro-Wilk test
# this necessary for ICA using
for feature in features2use:
    shapiro_w, shapiro_p = shapiro(all_log_data[feature])
    print "Feature: {:s}, w={:.3f}, p={:.3f}".format(feature, shapiro_w, shapiro_p)
# Bare expression left over from a notebook cell (echoed the list); no effect here.
features2use
from sklearn.decomposition import FastICA
from sklearn.decomposition import FactorAnalysis
# make FA analysis: extract 6 latent components from the landing-state data
fa_ = FactorAnalysis(n_components = 6, random_state = 333)
# get only landing data for FA analysis
land_data = all_log_data[all_log_data['STATUS_land'] == 1.0]
land_data = land_data[features2use]
fa_results = fa_.fit_transform(land_data)
def plot_fa_results(data, fa):
    """Bar-plot each FA component's per-feature weights (unmixing matrix)."""
    # One 1-based label per extracted component.
    n_components = len(fa.components_)
    dimensions = ['Component {}'.format(idx) for idx in range(1, n_components + 1)]
    # Round the component matrix and label rows/columns for plotting.
    components = pd.DataFrame(np.round(fa.components_, 4), columns=data.keys())
    components.index = dimensions
    # Grouped bar chart: one group per component, one bar per feature weight.
    fig, ax = plt.subplots(figsize=(14, 8))
    components.plot(ax=ax, kind='bar', legend=True, colormap="Paired");
    ax.set_ylabel("Features' unmixed matrix")
    ax.set_xticklabels(dimensions, rotation=0)
# plot FA components
#col_names = all_log_data[features4use].keys()
# Wrap the unmixing matrix in a DataFrame so plot_fa_results can label columns.
fa_data = pd.DataFrame(fa_.components_, columns=features2use)
plot_fa_results(fa_data, fa_)
def get_data_intervals(timeseries, interval=3.0):
    """Split a (sorted) sequence of time stamps into contiguous intervals.

    A gap larger than `interval` between consecutive stamps closes the
    running interval and starts a new one.

    Returns:
        List of (start, stop) tuples covering the data.
    """
    intervals = []
    previous = min(timeseries)  # equals timeseries[0] when input is sorted
    start = previous
    for current, earlier in zip(timeseries[1:], timeseries):
        if current - previous > interval:
            # Gap detected: close the interval at the stamp before the gap.
            intervals.append((start, earlier))
            start = current
        previous = current
    intervals.append((start, timeseries[-1]))
    return intervals
def vizualize_fa(dataset, log_id, fa, features, drone_state=None, use_components=[]):
    """Plot FA component trajectories over time for one flight log.

    Args:
        dataset: frame with 'LOG_ID', 'TIME', 'LAND_STATUS' and STATUS_*
            dummy columns.
        log_id: log to visualize.
        fa: fitted FactorAnalysis used to project `features`.
        features: feature columns fed to fa.transform.
        drone_state: optional state name (e.g. 'land') restricting rows via
            the STATUS_<state> column; None plots all states.
        use_components: 0-based component indices to plot; empty = all.
            (Mutable default is safe here: it is only read, never mutated.)
    """
    # vizualize fa analysis for the particular log id
    if drone_state is not None:
        logid_data = dataset[(dataset['LOG_ID'] == log_id) &
                             (dataset['STATUS_' + drone_state])].reset_index(drop=True)
        title = 'FA analysis: logID={:d}, drone status {:s}'.format(log_id, drone_state)
    else:
        logid_data = dataset[(dataset['LOG_ID'] == log_id)].reset_index(drop=True)
        title = 'FA analysis: logID={:d}, drone status ALL'.format(log_id)
    # Project the selected rows into FA component space, 1-based labels.
    fa_data = fa.transform(logid_data[features])
    components_names = ['Component {}'.format(i) for i in range(1, len(fa.components_) + 1)]
    fa_logid = pd.DataFrame(fa_data, columns=components_names).reset_index(drop=True)
    data_to_plot = pd.concat([logid_data, fa_logid], axis=1).reset_index(drop=True)
    data_to_plot['TIME'] = data_to_plot['TIME'] / 1.0e+6 # convert to seconds
    # get only necessary components
    if len(use_components) > 0:
        tmp = []
        for j in range(0, len(components_names)):
            if j in use_components:
                tmp.append(components_names[j])
        components_names = tmp
    necessary_columns = components_names + ['LAND_STATUS', 'STATUS_land']
    # check empty time intervals and remove them from plots
    time_intervals = get_data_intervals(list(data_to_plot['TIME']))
    for time_start, time_stop in time_intervals:
        interval_data = data_to_plot[(data_to_plot['TIME'] >= time_start) &
                                     (data_to_plot['TIME'] <= time_stop)].reset_index(drop=True)
        # prepare xticks for time ploting (~48 ticks per interval)
        x_ticks = interval_data['TIME']
        # NOTE(review): the step is 0 for intervals with fewer than ~48
        # samples, making range() raise ValueError -- confirm inputs.
        x_ticks_indexes = range(0, len(x_ticks), int(len(x_ticks) / (6 * 8) + 0.5))
        x_ticks = list(np.round(x_ticks[x_ticks_indexes].reset_index(drop=True), 2))
        fig, ax = plt.subplots(figsize = (14,8))
        # Drop the trailing STATUS_land column when a state filter is active.
        if drone_state is None:
            y_vals = necessary_columns
        else:
            y_vals = necessary_columns[:-1]
        interval_data.plot(y = y_vals, kind = 'line', x = ['TIME'],
                           ax=ax, rot=70, xticks=x_ticks, grid = True,
                           )
        ax.set_title(title + ", Time inteval {:.2f}-{:.2f}".format(time_start, time_stop))
# FA components 4 and 5 (0-based indices 3,4) for log 19: all states, then
# landing state only.
vizualize_fa(all_log_data, 19, fa_, features2use, use_components=[3,4])
vizualize_fa(all_log_data, 19, fa_, features2use, drone_state='land', use_components=[3,4])
# Same pair of plots for every log id.
for log_id in range(0, 25):
    vizualize_fa(all_log_data, log_id, fa_, features2use, use_components=[3,4])
    vizualize_fa(all_log_data, log_id, fa_, features2use, drone_state='land', use_components=[3,4])
from sklearn.cluster import KMeans
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import silhouette_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
import matplotlib.cm as cm
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn import svm
def remove_outliers_of_land_status(dataset, time_interval=0.8):
    """Trim the first `time_interval` seconds of every landing interval.

    For each LOG_ID, the landing-state rows are split into contiguous time
    intervals (a gap > 3 s starts a new interval) and the leading
    `time_interval` seconds of each interval are discarded.

    Args:
        dataset: frame with 'LOG_ID', 'STATUS_land' and 'TIME'
            (microseconds) columns.
        time_interval: seconds to trim from the start of every interval.

    Returns:
        Filtered DataFrame with a fresh 0..n-1 index, or None when the
        dataset yields no intervals at all (matches original behavior).
    """
    pieces = []
    for log_id in dataset['LOG_ID'].unique():
        # Landing-state rows of this log only.
        logid_data = dataset[(dataset['LOG_ID'] == log_id) & (dataset['STATUS_land'] == 1.0)]
        # Contiguous landing intervals; a gap of > 3 s separates intervals.
        time_intervals = get_data_intervals(list(logid_data['TIME']), 3.0 * 1.0e+6)
        for time_start, time_stop in time_intervals:
            piece = logid_data[(logid_data['TIME'] >= time_start + time_interval * 1.0e+6) &
                               (logid_data['TIME'] <= time_stop)]
            pieces.append(piece.reset_index(drop=True))
    if not pieces:
        return None
    # One concat instead of the original quadratic concat-in-a-loop; the
    # final reset_index also replaces the original's partially duplicated
    # index labels with a clean unique index.
    return pd.concat(pieces).reset_index(drop=True)
# Sanity check: visualize log 19 after trimming the landing transition noise.
check_data = remove_outliers_of_land_status(all_log_data)
vizualize_fa(check_data, 19, fa_, features2use)
def measure_accuracy(logid_list, dataset, preds, touch_detect_period=None):
    """Score touchdown predictions at landing-interval granularity.

    Each contiguous landing interval of every log counts once: an interval
    containing a true touchdown is a positive (correctly predicted when any
    'Preds' value inside the touch window is 1), an interval without one is
    a negative (a false positive when anything inside it is predicted).

    Args:
        logid_list: LOG_IDs to evaluate; an empty list means all logs.
        dataset: rows with 'LOG_ID', 'TIME' (microseconds), 'LAND_STATUS',
            row-aligned with preds.
        preds: per-row touchdown predictions (1.0 = touchdown).
        touch_detect_period: optional window length in seconds; when given,
            the touch window starts at the first touchdown row and lasts
            exactly this long.

    Returns:
        dict with 'recall', 'precision' and 'f1'.

    NOTE(review): raises ZeroDivisionError when no interval contains a true
    touchdown (ctp == 0) or no negative sub-intervals exist (ctn == 0) --
    confirm callers always pass logs containing touchdown data.
    """
    # measure avg recall score, precision and f1 score for data in logs and report recall score for all ids
    # for measuring using only landing state of the drone
    dataset_ = dataset.copy()
    dataset_.reset_index(drop=True, inplace=True)
    preds_ = pd.DataFrame(preds, columns=['Preds'])
    preds_.reset_index(drop=True, inplace=True)
    # concat dataset with preds column
    dataset_ = pd.concat([dataset_, preds_], axis=1)
    if len(logid_list) == 0:
        # use all logs
        logid_list = list(dataset_['LOG_ID'].unique())
    # init counters (interval-level confusion counts)
    pfp = 0.0 # predicted False Positive
    pfn = 0.0 # predicted False Negative
    ptn = 0.0 # predicted True Negative
    ptp = 0.0 # predicted True Positive
    ctp = 0.0 # current True Positive
    ctn = 0.0 # current True Negative
    for logid in logid_list:
        logid_data = remove_outliers_of_land_status(dataset_[dataset_['LOG_ID'] == logid]) # remove first 0.8 sec data from every log
        time_intervals = get_data_intervals(list(logid_data['TIME']), 3.0 * 1.0e+6) # 3 sec - theshold of time between intevals, all time intervals in the log
        num_intervals = float(len(time_intervals)) # counter, number of time intervals (currently unused)
        # process each landing time interval
        for time_start, time_stop in time_intervals:
            # get time interval data
            time_interval_data = logid_data[(logid_data['TIME'] >= time_start) &
                                            (logid_data['TIME'] <= time_stop)]
            # get only touch data inside the time interval.
            # Strongly that in landing time interval may be ony one touch moment
            touch_moment_data = time_interval_data[time_interval_data['LAND_STATUS'] == 1.0]
            # check if touching moment exists in the time period data
            # determine overlapping of touch moment and clustering results
            if touch_moment_data.shape[0] > 0:
                ctp += 1.0 # True Positive (inside the touch moment)
                # get touch moment's time interval inside the time interval
                touch_start_time = touch_moment_data['TIME'].min()
                touch_stop_time = touch_moment_data['TIME'].max()
                if touch_detect_period is not None:
                    # Force the touch window to a fixed length from its start.
                    # NOTE(review): the source's flat indentation is ambiguous
                    # about whether the re-selection below ran inside or after
                    # this branch; inside preserves touch_moment_data as
                    # touchdown-only rows when no period is given -- confirm.
                    shift = touch_detect_period * 1.0e+6
                    touch_stop_time = touch_start_time + shift
                    touch_moment_data = time_interval_data[(time_interval_data['TIME'] >= touch_start_time) &
                                                           (time_interval_data['TIME'] <= touch_stop_time)]
                # get time interval's data before the touch moment
                data_before_touch_moment = time_interval_data[(time_interval_data['TIME'] < touch_start_time) &
                                                              (time_interval_data['TIME'] >= time_start)]
                if data_before_touch_moment.shape[0] > 0:
                    ctn += 1.0 # True Negative
                    # process only with touch moment data presents in the time period
                    if data_before_touch_moment['Preds'].sum() >= 1.0:
                        pfp += 1.0 # predicted before time period (False Positive)
                    else:
                        ptn += 1.0 # predicted before time period (True Negative)
                # get time interval's data after the touch moment
                # (currently selected but not scored)
                data_after_touch_moment = time_interval_data[(time_interval_data['TIME'] > touch_stop_time) &
                                                             (time_interval_data['TIME'] <= time_stop)]
                # check prediction inside touch moment
                if touch_moment_data['Preds'].sum() >= 1.0:
                    ptp += 1.0 # predicted in time period (True Positive)
                else:
                    pfn += 1.0 # predicted in time period (False Negative)
            else:
                # no touch moment in time interval
                ctn += 1.0 # True Negative
                if time_interval_data['Preds'].sum() >= 1.0:
                    pfp += 1.0 # predicted in time period (False Positive)
                else:
                    ptn += 1.0 # predicted after time period (True Negative)
    tp = ptp / ctp # estimate True Positive rate
    fn = pfn / ctp # estimate False Negative rate
    recall_ = tp / (tp + fn)
    fp = pfp / ctn # estimate False Positive rate
    if (tp + fp) == 0.0:
        precision_ = 0.0
    else:
        precision_ = tp / (tp + fp)
    if (precision_ + recall_) == 0.0:
        f1_ = 0.0
    else:
        f1_ = 2 * (precision_ * recall_) / (precision_ + recall_)
    return {'recall': recall_, 'precision':precision_, 'f1':f1_}
import time
def clusterize(clf, title="", touch_detect_period=None, comp_num=5):
    """Cross-validate a 2-cluster touchdown model on per-log data splits.

    The first half of the LOG_IDs forms the learning pool, shuffled into
    10 train/validation splits; the remaining logs are a fixed held-out
    test set. Each round fits clf on the first comp_num FA components of
    the training rows and scores recall/precision/F1 via measure_accuracy.

    Args:
        clf: estimator with fit/predict (KMeans, GaussianMixture, ...).
        title: label for prints/plots; "GMM" additionally records AIC/BIC.
        touch_detect_period: optional window (seconds) forwarded to
            measure_accuracy for the training score only.
        comp_num: number of leading FA components fed to the model.

    Returns:
        (best_f1_model, best_precision_model, per-split results DataFrame).
    """
    # clusterize data, deviding dataset into learning and testin data
    log_ids = np.arange(all_log_data['LOG_ID'].max() / 2)
    highest_prec = -np.inf
    highest_f1 = -np.inf
    best_kmeans = None
    best_fa_data = None
    best_preds = None
    split_num = 10
    learn_results = pd.DataFrame(columns=['SplitNum',
                                          'F1_train', 'Prec_train', 'Recall_train',
                                          'F1_valid', 'Prec_valid', 'Recall_valid',
                                          'F1_test', 'Prec_test', 'Recall_test',
                                          'Silhouette', 'PredictTime', 'LearningTime', 'AIC', 'BIC'])
    counter = 0
    spl_num = 1
    print "Using:", title
    # Fixed held-out test set: landing rows of the remaining logs, trimmed.
    test_log_ids = np.arange(max(log_ids) + 1, 25)
    print "Logs to use in testing", test_log_ids
    only_test_data = all_log_data[(all_log_data['LOG_ID'].isin(test_log_ids)) &
                                  (all_log_data['STATUS_land'] == 1.0)].reset_index(drop=True)
    only_test_data = remove_outliers_of_land_status(only_test_data)
    otest_fa_data = fa_.transform(only_test_data[features2use])
    print "Logs to use in learning", log_ids
    ss = ShuffleSplit(n_splits=split_num, test_size=0.35, random_state=777)
    for train_index, test_index in ss.split(log_ids):
        # NOTE(review): train_index/test_index are positions into log_ids
        # but are used directly as LOG_ID values -- valid only because
        # log_ids is a 0-based arange.
        train_data = all_log_data[(all_log_data['LOG_ID'].isin(train_index)) &
                                  (all_log_data['STATUS_land'] == 1.0)].reset_index(drop=True)
        train_data = remove_outliers_of_land_status(train_data)
        print "Split id: {:d}".format(spl_num)
        label0 = len(train_data[train_data['LAND_STATUS'] == 0.0])
        print "Label 0 size in the training set:", label0
        print "Label 1 size in the training set:", len(train_data[train_data['LAND_STATUS'] == 1.0])
        test_data = all_log_data[(all_log_data['LOG_ID'].isin(test_index)) &
                                 (all_log_data['STATUS_land'] == 1.0)].reset_index(drop=True)
        test_data = remove_outliers_of_land_status(test_data)
        tr_fa_data = fa_.transform(train_data[features2use])
        tr_fa_data_y = train_data['LAND_STATUS'].reset_index(drop=True)
        print "Training...."
        t0 = time.time()
        # y is ignored by the unsupervised estimators; passing it keeps the
        # call uniform across model types.
        clf.fit(tr_fa_data[:,0:comp_num], tr_fa_data_y)
        tlearn = time.time() - t0
        preds_train = clf.predict(tr_fa_data[:,0:comp_num])
        # Silhouette of the TRUE labels on the FA features (not of preds).
        sil_sc = silhouette_score(X=tr_fa_data, labels=tr_fa_data_y, sample_size=1000);
        tr_acc = measure_accuracy([], train_data, preds_train, touch_detect_period=touch_detect_period) # learning accuracy
        tst_fa_data = fa_.transform(test_data[features2use])
        preds_test = clf.predict(tst_fa_data[:,0:comp_num])
        tst_acc = measure_accuracy([], test_data, preds_test) # validation accuracy
        t1 = time.time()
        preds_otest = clf.predict(otest_fa_data[:,0:comp_num])
        t2 = time.time() - t1
        test_acc = measure_accuracy([], only_test_data, preds_otest) # testing accuracy
        if title=='GMM':
            aikbic = [clf.aic(tr_fa_data[:,0:comp_num]), clf.bic(tr_fa_data[:,0:comp_num])]
        else:
            aikbic = [0.0, 0.0]
        learn_results.loc[counter] = [spl_num,
                                      tr_acc['f1'], tr_acc['precision'], tr_acc['recall'],
                                      tst_acc['f1'], tst_acc['precision'], tst_acc['recall'],
                                      test_acc['f1'], test_acc['precision'], test_acc['recall'],
                                      sil_sc, t2, tlearn] + aikbic
        counter += 1
        spl_num += 1
        # Track the best-scoring fitted model by test precision and by F1.
        if test_acc['precision'] > highest_prec:
            highest_prec = test_acc['precision']
            best_kmeans_prec = clf
            # NOTE(review): fa_data here resolves to the module-level frame
            # built for plot_fa_results, not a per-split result -- confirm.
            best_fa_data = fa_data
            best_preds = preds_train
        if test_acc['f1'] > highest_f1:
            highest_f1 = test_acc['f1']
            best_kmeans_f1 = clf
            best_fa_data = fa_data
            best_preds = preds_train
    # Summary plots across splits (GMM variant adds AIC/BIC curves).
    if title != "GMM":
        learn_results.plot(kind='line', x=['SplitNum'], y=['F1_train', 'F1_valid', 'F1_test', 'Silhouette'], title=title)
        learn_results.plot(kind='line', x=['SplitNum'], y=['Recall_train', 'Recall_valid', 'Recall_test'], title=title)
        learn_results.plot(kind='line', x=['SplitNum'], y=['Prec_train', 'Prec_valid', 'Prec_test'], title=title)
    else:
        learn_results.plot(kind='line', x=['SplitNum'], y=['AIC', 'BIC'], title=title)
        learn_results.plot(kind='line', x=['SplitNum'], y=['F1_train', 'F1_valid', 'F1_test'], title=title)
        learn_results.plot(kind='line', x=['SplitNum'], y=['Recall_train', 'Recall_valid', 'Recall_test'], title=title)
        learn_results.plot(kind='line', x=['SplitNum'], y=['Prec_train', 'Prec_valid', 'Prec_test'], title=title)
    return best_kmeans_f1, best_kmeans_prec, learn_results
from sklearn import mixture
from sklearn.cluster import SpectralClustering
# Candidate 2-cluster models (touch / no-touch) with fixed seeds.
clf_Kmeans = KMeans(n_clusters=2, n_jobs=1, random_state=1111)
clf_GMM = mixture.GaussianMixture(n_components=2, random_state=1111)
clf_spcl = SpectralClustering(n_clusters=2, random_state=1111)
# Cross-validated runs; keep the best-F1 model of each algorithm.
best_f1_r1,_,r1 = clusterize(clf_Kmeans, title="K-Means")
print r1
best_f1_r2,_,r2 = clusterize(clf_GMM, title="GMM")
print r2
#_,_,r3 = clusterize(clf_spcl, title="Spectral Clustering")
#print r3
print r1.describe()
print r1
print r2.describe()
print r2
def show_cluster_results(fa_data, preds, num_clusters, dimensions, true_data=None, title="K-Means"):
    """Scatter-plot cluster assignments on pairs of FA components.

    Args:
        fa_data: array of FA-projected samples (n_samples x n_components).
        preds: cluster label per sample, row-aligned with fa_data.
        num_clusters: number of clusters, used for the color scale.
            NOTE(review): num_clusters == 1 divides by zero below.
        dimensions: list of [i, j] 1-based component index pairs to plot.
        true_data: optional FA-projected ground-truth samples, overlaid
            in blue.
        title: algorithm name used in the plot title.
    """
    # visualize clustering results
    components_names = ['Component {}'.format(i) for i in range(1, fa_data.shape[1] + 1)]
    fa_data = pd.DataFrame(fa_data, columns=components_names)
    if true_data is not None:
        true_data = pd.DataFrame(true_data, columns=components_names)
    predictions = pd.DataFrame(preds, columns = ['Cluster'])
    plot_data = pd.concat([predictions, fa_data], axis = 1)
    # Set color map
    cmap = cm.get_cmap('gist_rainbow')
    for dims in dimensions: # plot for particular dim pairs
        # Generate the cluster plot
        cluster_A = 'Component {}'.format(dims[0])
        cluster_B = 'Component {}'.format(dims[1])
        fig, ax = plt.subplots(figsize = (14,8))
        # Color the points based on assigned cluster
        for i, cluster in plot_data.groupby('Cluster'):
            cluster.plot(ax = ax, kind = 'scatter', x = cluster_A, y = cluster_B, \
                         color = cmap((i) * 1.0 / (num_clusters - 1)), label = 'Cluster %i'%(i), s=30);
        # Overlay the ground-truth touchdown samples when provided.
        if true_data is not None:
            true_data.plot(ax = ax, kind = 'scatter', x = cluster_A, y = cluster_B, \
                           color = 'b', label = 'True data', s=20);
        # Set plot title
        ax.set_title("{:s} clustering on FA analysis data for N={:d} clusters".format(title, num_clusters));
# Held-out logs 12-24, landing state only, transition noise removed.
test_log_ids = np.arange(12, 25)
only_test_data = all_log_data[(all_log_data['LOG_ID'].isin(test_log_ids)) &
                              (all_log_data['STATUS_land'] == 1.0)].reset_index(drop=True)
only_test_data = remove_outliers_of_land_status(only_test_data)
otest_fa_data = fa_.transform(only_test_data[features2use])
# Ground-truth touchdown rows projected into FA space.
true_data = only_test_data[only_test_data['LAND_STATUS'] == 1.0]
true_data = fa_.transform(true_data[features2use])
# Best-F1 K-Means model: predictions on component pairs (1,2) and (2,3).
best_preds = best_f1_r1.predict(otest_fa_data[:,0:5])
show_cluster_results(otest_fa_data[:,0:5], best_preds, 2, [[1, 2]], true_data[:,0:5])
show_cluster_results(otest_fa_data[:,0:5], best_preds, 2, [[2, 3]], true_data[:,0:5])
# Same plots for the best-F1 GMM model.
true_data = only_test_data[only_test_data['LAND_STATUS'] == 1.0]
true_data = fa_.transform(true_data[features2use])
best_preds = best_f1_r2.predict(otest_fa_data[:,0:5])
show_cluster_results(otest_fa_data[:,0:5], best_preds, 2, [[1, 2]], true_data[:,0:5], title="GMM")
show_cluster_results(otest_fa_data[:,0:5], best_preds, 2, [[2, 3]], true_data[:,0:5], title="GMM")
# Training split: logs 0-11, landing state only, noise-trimmed, FA-projected.
train_index = np.arange(0, 12)
train_data = all_log_data[(all_log_data['LOG_ID'].isin(train_index)) &
                          (all_log_data['STATUS_land'] == 1.0)].reset_index(drop=True)
train_data = remove_outliers_of_land_status(train_data)
tr_fa_data = fa_.transform(train_data[features2use])
tr_fa_data_y = train_data['LAND_STATUS'].reset_index(drop=True)
# GMM hyper-parameter grid: covariance structure x iteration budget.
params = ['full', 'tied', 'diag', 'spherical']
iters = [100, 200, 300]
finest_aic = np.inf
finest_bic = np.inf
finest_f1 = -np.inf
best_params_aic = []
best_params_bic = []
best_params_f1 = []
# Grid search over covariance type and max_iter; track the best model by F1
# on the held-out logs and by AIC/BIC on the training components.
for iter_ in iters:
    print "Max_iter", iter_
    for param in params:
        clf_GMM = mixture.GaussianMixture(n_components=2, covariance_type=param, max_iter=iter_, tol=0.1)
        clf_GMM.fit(tr_fa_data[:,0:5],tr_fa_data_y)
        preds_otest = clf_GMM.predict(otest_fa_data[:,0:5])
        # Score on the held-out logs with a fixed 0.8 s touch window.
        acc = measure_accuracy([], only_test_data, preds_otest, touch_detect_period=0.8)
        aic = clf_GMM.aic(tr_fa_data[:,0:5])
        bic = clf_GMM.bic(tr_fa_data[:,0:5])
        print "Type = {:s}, F1_score = {:.3f}, Recall={:.3f}, Precision={:.3f}, AIC={:.3f}, BIC={:.3f}".format(param,
                                                                                                               acc['f1'],
                                                                                                               acc['recall'],
                                                                                                               acc['precision'],
                                                                                                               aic, bic)
        if acc['f1'] > finest_f1:
            finest_f1 = acc['f1']
            f1_best = clf_GMM
            # NOTE(review): the F1 record's third slot stores aic, not f1.
            best_params_f1 = [iter_, param, aic]
        if aic < finest_aic:
            finest_aic = aic
            aic_best = clf_GMM
            best_params_aic = [iter_, param, aic]
        if bic < finest_bic:
            finest_bic = bic
            bic_best = clf_GMM
            best_params_bic = [iter_, param, bic]
print "Best params for AIC", best_params_aic
print "Best params for BIC", best_params_bic
print "Best params for F1", best_params_f1
# Fit the chosen GMM configuration on the training components and visualize
# its predictions on the held-out test logs.
true_data = only_test_data[only_test_data['LAND_STATUS'] == 1.0]
true_data = fa_.transform(true_data[features2use])
clf_GMM = mixture.GaussianMixture(n_components=2, covariance_type='tied', max_iter=300, tol=0.1)
clf_GMM.fit(tr_fa_data[:,0:5])
preds_final = clf_GMM.predict(otest_fa_data[:,0:5])
# Bug fix: the original passed the undefined name `preds` (NameError);
# the freshly computed preds_final is what should be plotted.
show_cluster_results(otest_fa_data[:,0:5], preds_final, 2, [[1, 2]], true_data=true_data[:,0:5], title='GMM')
# Re-run the full cross-validated evaluation with the tuned GMM settings.
clf_GMM = mixture.GaussianMixture(n_components=2, max_iter=200, tol=0.1, covariance_type='tied')
best_f1_final,_,r2_final = clusterize(clf_GMM, title="GMM")
print r2_final
print r2_final.describe()
def visualize_prediction(dataset, preds):
    """Plot predicted cluster labels against the true LAND_STATUS per log.

    Args:
        dataset: frame with 'LOG_ID', 'TIME' (microseconds), 'LAND_STATUS'.
        preds: cluster label per row, aligned with dataset.
    """
    log_ids = dataset['LOG_ID'].unique()
    logs_data = dataset.copy()
    logs_data.reset_index(drop=True, inplace=True)
    predictions = pd.DataFrame(preds, columns=['Cluster'])
    predictions.reset_index(drop=True, inplace=True)
    logs_data = pd.concat([logs_data, predictions], axis=1)
    for logid in log_ids:
        # Explicit copy: the shifts below must not touch the shared frame.
        log_data = logs_data[logs_data['LOG_ID'] == logid].copy()
        # Small vertical offset so the cluster line stays visible where it
        # overlaps LAND_STATUS (the original comment said "plus one", but
        # the code adds 0.02).
        log_data['Cluster'] = log_data['Cluster'] + 0.02
        log_data['TIME'] = log_data['TIME'] / 1.0e+6  # convert to seconds
        time_intervals = get_data_intervals(list(log_data['TIME']))
        for time_start, time_stop in time_intervals:
            interval_data = log_data[(log_data['TIME'] >= time_start) &
                                     (log_data['TIME'] <= time_stop)].reset_index(drop=True)
            x_ticks = interval_data['TIME']
            # Intervals with fewer than ~48 samples yield a zero tick step;
            # skip them explicitly (narrowed from the original bare except,
            # which silently swallowed every error here).
            step = int(len(x_ticks) / (6 * 8) + 0.5)
            if step == 0:
                continue
            x_ticks_indexes = range(0, len(x_ticks), step)
            x_ticks = list(np.round(x_ticks[x_ticks_indexes].reset_index(drop=True), 2))
            fig, ax = plt.subplots(figsize=(14, 2))
            interval_data.plot(y=['LAND_STATUS', 'Cluster'], kind='line', x=['TIME'],
                               ax=ax, rot=70, xticks=x_ticks, grid=True)
            ax.set_title("LogID={:d}, Time inteval {:.2f}-{:.2f}".format(logid, time_start, time_stop))
            ax.set_ylim([-0.02, 2])
# Render the final GMM predictions for every held-out test log.
visualize_prediction(only_test_data, preds_final)